# content = '''
# <!DOCTYPE html>
# <html lang="en">
# <head>
#     <meta charset="UTF-8">
#     <meta http-equiv="X-UA-Compatible" content="IE=edge">
#     <meta name="viewport" content="width=device-width, initial-scale=1.0">
#     <title>Document</title>
#     <meta itemprop="haha" name='keywords' itemscope='test'>
#     <style>
#         #demo{
#             width: 100;
#             height: 100;
#             background-color: cornflowerblue;
#         }
#     </style>
#     <script>
#         var a=1;
#         console.log(a);
#     </script>
# </head>
# <body>
#     <div id="demo">这是一个测试</div>
#     <a href="http://www.baidu.com">
#         <img src="/demo.jpeg" alt="img">
#     </a>
#     <a href="http://www.bilibili.com"></a>
# </body>
# </html>
# '''

from lxml import etree,html
from pyquery import PyQuery as pq

# Characters deleted from the extracted text: ASCII space, tab, carriage
# return, line feed, no-break space (U+00A0) and ideographic space (U+3000).
_STRIPPED_CHARS = str.maketrans('', '', ' \t\r\n\xa0\u3000')


def get_plain_text(content) -> str:
    """Return the text of an HTML document with all whitespace removed.

    The contents of ``<script>``, ``<style>`` and ``<noscript>`` elements are
    discarded; every remaining text node is concatenated in document order and
    spaces, tabs, line breaks, no-break spaces and ideographic spaces are
    deleted from the result.

    Args:
        content: HTML markup (or anything else ``PyQuery`` accepts).

    Returns:
        The concatenated text. Blank ``str``/``bytes`` input yields ``''``
        instead of being handed to the parsers.
    """
    # Blank markup has no text; return early rather than passing an empty
    # document to the parsers.
    if isinstance(content, (str, bytes)) and not content.strip():
        return ''

    doc = pq(content)
    doc.remove('script')
    doc.remove('style')

    # Re-parse the serialized markup with lxml's HTML parser and drop any
    # non-visible elements the pyquery pass did not remove.
    tree = html.fromstring(str(doc))
    for element in tree.xpath('//script | //style | //noscript'):
        # drop_tree() keeps the element's tail text, which belongs to the
        # surrounding content; parent.remove() would throw it away.
        element.drop_tree()

    # Join the text nodes directly instead of scrubbing the repr() of the
    # list, which mangled quotes, backslashes and brackets in the text.
    return ''.join(tree.xpath('//text()')).translate(_STRIPPED_CHARS)
